In [29]:
from pyfasta import Fasta
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [36]:
# The FASTA file named below must be present in the current working directory.
cdna_fasta_path = "Homo_sapiens.GRCh38.cdna.all.fa"
all_trans = Fasta(cdna_fasta_path)

Get the first `max_trans` transcripts whose sequence is longer than `min_len`


In [14]:
# Selection parameters: keep at most max_trans transcripts, each strictly
# longer than min_len.
min_len = 500
max_trans = 10

# Walk the names in the order all_trans.keys() yields them and copy each
# qualifying sequence out as a plain string, keyed by its name.
trans_to_keep = {}
count = 0
for tname in all_trans.keys():
    if count == max_trans:
        # Quota reached; stop scanning the remaining names.
        break
    record = all_trans[tname]
    if len(record) <= min_len:
        # Too short to keep.
        continue
    trans_to_keep[tname] = str(record)
    count += 1

Now write the selected transcripts to a FASTA file


In [33]:
# Wrap each kept sequence in a SeqRecord. The record id is the first
# whitespace-delimited token of the transcript name; the description is
# left empty.
trans_recs = [
    SeqRecord(seq=Seq(sequence), id=name.split()[0], description="")
    for name, sequence in trans_to_keep.items()
]

In [34]:
# The output file name embeds the transcript cap (max_trans) and the length
# threshold (min_len) used for the selection.
name_template = "{0}_trans_gt_{1}_bp.fasta"
out_fname = name_template.format(max_trans, min_len)

In [35]:
# Write every record in trans_recs to out_fname in FASTA format. The file is
# opened in "w" mode, so an existing file with the same name is overwritten.
with open(out_fname, "w") as fasta_out:
    SeqIO.write(trans_recs, fasta_out, "fasta")

In [ ]: